Installation

Include instructions for installing specific version of musicatk

If you are attempting to reproduce this analysis, you can 1) install/load renv, 2) copy/paste the renv.lock file into your current working directory and 3) run the renv::restore() command to automatically install all of the packages with the same version.

# Make sure the 'renv.lock' file is in your current working directory.
install.packages("renv")
library(renv)
renv::restore()

Load required libraries

library(musicatk)
library("TCGAbiolinks")
library("withr")

Download mutation calls for TCGA

tcga_datasets <- c("TCGA-LAML", "TCGA-ACC", "TCGA-BLCA", "TCGA-LGG", "TCGA-BRCA", "TCGA-CESC", "TCGA-CHOL", "TCGA-COAD", "TCGA-ESCA", "TCGA-GBM", "TCGA-HNSC", "TCGA-KICH", "TCGA-KIRC", "TCGA-KIRP", "TCGA-LIHC", "TCGA-LUAD", "TCGA-LUSC", "TCGA-DLBC", "TCGA-MESO", "TCGA-OV", "TCGA-PAAD", "TCGA-PCPG", "TCGA-PRAD", "TCGA-READ", "TCGA-SARC", "TCGA-SKCM", "TCGA-STAD", "TCGA-TGCT", "TCGA-THYM", "TCGA-THCA", "TCGA-UCS", "TCGA-UCEC", "TCGA-UVM")
types <- substring(tcga_datasets, 6)

dataset.list <- list()
annot <- list()
for(i in seq_along(tcga_datasets)) {
  query <- GDCquery(project = tcga_datasets[i], 
                  data.category = "Simple Nucleotide Variation", 
                  data.type = "Masked Somatic Mutation", 
                  workflow.type = "MuTect2 Variant Aggregation and Masking",
                  experimental.strategy = "WXS",
                  data.format = "maf")
  GDCdownload(query)
  data <- GDCprepare(query)
  dataset.list[[types[i]]] <- data.frame(extract_variants_from_matrix(data), tumor_type=types[i], stringsAsFactors = FALSE)
}

Create subplots for Figure 1

Select a subset of tumor types

select_tumor_types <- c("COAD", "LUAD", "SKCM", "UCEC")
dataset.select <- do.call(rbind, dataset.list[select_tumor_types])
annot.select <- unique(dataset.select[,c("sample", "tumor_type")])

Create musica object

g <- select_genome("38")
musica <- create_musica(dataset.select, g)
## Checking that chromosomes in the 'variant' object match chromosomes in the 'genome' object.
## Checking that the reference bases in the 'variant' object match the reference bases in the 'genome' object.
## Standardizing INS/DEL style
## Converting adjacent SBS into DBS
## 16044 SBS converted to DBS
matched_annot <- annot.select[match(samp_annot(musica)$Samples, annot.select[,1]), 2]
samp_annot(musica, "Tumor_Types") <- matched_annot
build_standard_table(musica, g, "SBS96")
## Building count table from SBS with SBS96 schema

Discover signatures

res = discover_signatures(musica = musica, table_name = "SBS96", 
                          num_signatures = 4, algorithm = "lda", 
                          seed = 12345, nstart = 1)

Plot exposures and signatures

# Plot exposures
plot_exposures(res, proportional = FALSE, sort_samples = "total", 
               num_samples = 150)

plot_exposures(res, proportional = TRUE, sort_samples = "Signature3")

plot_exposures(res, proportional = TRUE, num_samples = 8, sort_samples = "name")

# Plot signatures
plot_signatures(res)

# Plot exposures by signature
plot_exposures(result = res, proportional = TRUE, sort_samples = "total", 
               plot_type = "box", group_by = "signature", 
               annotation = "Tumor_Types", color_by = "annotation")

# Plot exposures by tumor type
plot_exposures(result = res, proportional = TRUE, sort_samples = "Signature4", 
               plot_type = "bar", group_by = "annotation", 
               annotation = "Tumor_Types", color_by = "signature")

Plot UMAP

with_seed(1, create_umap(res))
plot_umap(result = res, color_by = "annotation", annotation = "Tumor_Types")

plot_umap(res, "signatures")

Create heatmap

plot_heatmap(res_annot = res, proportional = TRUE, scale = TRUE, 
             annotation = "Tumor_Types")

Compare to COSMIC v2 Signatures

compare_cosmic_v2(res, threshold = 0.88)

##      cosine x_sig_index y_sig_index x_sig_name  y_sig_name
## 4 0.9966914           4          10 Signature4 Signature10
## 1 0.9951916           1           7 Signature1  Signature7
## 2 0.9766938           2           6 Signature2  Signature6
## 3 0.8843553           3           4 Signature3  Signature4

Perform clustering

clust <- cluster_exposure(result = res, nclust = 4, iter.max = 100, tol = 1e-14)
## Metric: 'euclidean'; comparing: 1963 vectors.
plot_umap(result = res, color_by = "cluster", annotation = "Tumor_Types", 
          clust = clust)

glm_stats <- exposure_differential_analysis(musica_result = res, 
                                            annotation = "Tumor_Types", 
                                            method = "glm")
plot_differential_analysis(glm_stats, "glm", 4)

TCGA PanCancer Analysis for Figure 2

Create musica object with all tumor types

dataset <- do.call(rbind, dataset.list)
annot <- unique(dataset[,c("sample", "tumor_type")])
g <- select_genome("38")
tcga <- create_musica(dataset, g)
## Checking that chromosomes in the 'variant' object match chromosomes in the 'genome' object.
## Checking that the reference bases in the 'variant' object match the reference bases in the 'genome' object.
## Standardizing INS/DEL style
## Converting adjacent SBS into DBS
## 24594 SBS converted to DBS
matched_annot <- annot[match(samp_annot(tcga)$Samples, annot[,1]), 2]
samp_annot(tcga, "Tumor_Types") <- matched_annot

Analyze SBS profiles

build_standard_table(tcga, g, "SBS96")
tcga_sbs_subset <- subset_musica_by_counts(tcga, "SBS96", 5)
tcga_v3_sbs <- with_seed(1, auto_predict_grid(musica = tcga_sbs_subset, 
              table_name = "SBS96", signature_res = cosmic_v3_sbs_sigs_exome, 
              algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_sbs))
plot_umap(result = tcga_v3_sbs, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)

plot_umap(tcga_v3_sbs, same_scale = FALSE)

Analyze DBS profiles

build_standard_table(tcga, g, "DBS78")
tcga_dbs_subset <- subset_musica_by_counts(tcga, "DBS78", 5)
tcga_v3_dbs <- with_seed(1, auto_predict_grid(musica = tcga_dbs_subset, 
              table_name = "DBS78", signature_res = cosmic_v3_dbs_sigs, 
              algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_dbs))
plot_umap(result = tcga_v3_dbs, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)

plot_umap(tcga_v3_dbs, same_scale = FALSE)

Analyze INDEL profiles

build_standard_table(tcga, g, "IND83")
tcga_ind_subset <- subset_musica_by_counts(musica = tcga, table_name = "IND83", num_counts = 5)
tcga_v3_ind <- with_seed(1, auto_predict_grid(musica = tcga_ind_subset, 
              table_name = "IND83", signature_res = cosmic_v3_indel_sigs, 
              algorithm = "lda", sample_annotation = "Tumor_Types"))
with_seed(1, create_umap(tcga_v3_ind))
plot_umap(result = tcga_v3_ind, proportional = TRUE, color_by = "annotation", annotation = "Tumor_Types", add_annotation_labels = TRUE, annotation_text_box = TRUE, annotation_label_size = 6, legend = FALSE, strip_axes = TRUE)

plot_umap(tcga_v3_ind, same_scale = FALSE, legend = FALSE, strip_axes = TRUE)

Session information

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.10_1/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] withr_2.4.1         TCGAbiolinks_2.18.0 musicatk_1.1.1     
##  [4] NMF_0.23.0          Biobase_2.50.0      BiocGenerics_0.36.0
##  [7] cluster_2.1.0       rngtools_1.5        pkgmaker_0.32.2    
## [10] registry_0.5-1     
## 
## loaded via a namespace (and not attached):
##   [1] circlize_0.4.12                         
##   [2] BiocFileCache_1.14.0                    
##   [3] plyr_1.8.6                              
##   [4] BiocParallel_1.24.1                     
##   [5] topicmodels_0.2-12                      
##   [6] GenomeInfoDb_1.26.2                     
##   [7] ggplot2_3.3.3                           
##   [8] gridBase_0.4-7                          
##   [9] digest_0.6.27                           
##  [10] foreach_1.5.1                           
##  [11] htmltools_0.5.1.1                       
##  [12] magrittr_2.0.1                          
##  [13] memoise_2.0.0                           
##  [14] BSgenome_1.58.0                         
##  [15] tm_0.7-8                                
##  [16] doParallel_1.0.16                       
##  [17] ComplexHeatmap_2.6.2                    
##  [18] Biostrings_2.58.0                       
##  [19] readr_1.4.0                             
##  [20] matrixStats_0.58.0                      
##  [21] BSgenome.Hsapiens.UCSC.hg38_1.4.3       
##  [22] R.utils_2.10.1                          
##  [23] askpass_1.1                             
##  [24] prettyunits_1.1.1                       
##  [25] colorspace_2.0-0                        
##  [26] ggrepel_0.9.1                           
##  [27] blob_1.2.1                              
##  [28] rvest_0.3.6                             
##  [29] rappdirs_0.3.3                          
##  [30] xfun_0.21                               
##  [31] dplyr_1.0.4                             
##  [32] crayon_1.4.1                            
##  [33] RCurl_1.98-1.2                          
##  [34] jsonlite_1.7.2                          
##  [35] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 
##  [36] iterators_1.0.13                        
##  [37] glue_1.4.2                              
##  [38] gtable_0.3.0                            
##  [39] zlibbioc_1.36.0                         
##  [40] XVector_0.30.0                          
##  [41] GetoptLong_1.0.5                        
##  [42] DelayedArray_0.16.1                     
##  [43] TxDb.Hsapiens.UCSC.hg38.knownGene_3.10.0
##  [44] shape_1.4.5                             
##  [45] scales_1.1.1                            
##  [46] DBI_1.1.1                               
##  [47] Rcpp_1.0.6                              
##  [48] xtable_1.8-4                            
##  [49] progress_1.2.2                          
##  [50] clue_0.3-58                             
##  [51] bit_4.0.4                               
##  [52] matrixTests_0.1.9                       
##  [53] stats4_4.0.2                            
##  [54] httr_1.4.2                              
##  [55] RColorBrewer_1.1-2                      
##  [56] ellipsis_0.3.1                          
##  [57] modeltools_0.2-23                       
##  [58] pkgconfig_2.0.3                         
##  [59] XML_3.99-0.5                            
##  [60] R.methodsS3_1.8.1                       
##  [61] farver_2.0.3                            
##  [62] uwot_0.1.10                             
##  [63] dbplyr_2.1.0                            
##  [64] tidyselect_1.1.0                        
##  [65] labeling_0.4.2                          
##  [66] rlang_0.4.10                            
##  [67] reshape2_1.4.4                          
##  [68] AnnotationDbi_1.52.0                    
##  [69] munsell_0.5.0                           
##  [70] tools_4.0.2                             
##  [71] cachem_1.0.4                            
##  [72] downloader_0.4                          
##  [73] generics_0.1.0                          
##  [74] RSQLite_2.2.3                           
##  [75] evaluate_0.14                           
##  [76] stringr_1.4.0                           
##  [77] fastmap_1.1.0                           
##  [78] yaml_2.2.1                              
##  [79] knitr_1.31                              
##  [80] bit64_4.0.5                             
##  [81] purrr_0.3.4                             
##  [82] slam_0.1-48                             
##  [83] R.oo_1.24.0                             
##  [84] xml2_1.3.2                              
##  [85] biomaRt_2.46.3                          
##  [86] compiler_4.0.2                          
##  [87] curl_4.3                                
##  [88] png_0.1-7                               
##  [89] tibble_3.0.6                            
##  [90] stringi_1.5.3                           
##  [91] highr_0.8                               
##  [92] TCGAbiolinksGUI.data_1.10.0             
##  [93] GenomicFeatures_1.42.1                  
##  [94] RSpectra_0.16-0                         
##  [95] lattice_0.20-41                         
##  [96] Matrix_1.3-2                            
##  [97] vctrs_0.3.6                             
##  [98] pillar_1.4.7                            
##  [99] lifecycle_1.0.0                         
## [100] combinat_0.0-8                          
## [101] GlobalOptions_0.1.2                     
## [102] RcppAnnoy_0.0.18                        
## [103] data.table_1.13.6                       
## [104] cowplot_1.1.1                           
## [105] bitops_1.0-6                            
## [106] rtracklayer_1.50.0                      
## [107] GenomicRanges_1.42.0                    
## [108] R6_2.5.0                                
## [109] gridExtra_2.3                           
## [110] IRanges_2.24.1                          
## [111] codetools_0.2-18                        
## [112] MCMCprecision_0.4.0                     
## [113] MASS_7.3-53                             
## [114] gtools_3.8.2                            
## [115] assertthat_0.2.1                        
## [116] philentropy_0.4.0                       
## [117] SummarizedExperiment_1.20.0             
## [118] openssl_1.4.3                           
## [119] rjson_0.2.20                            
## [120] GenomicAlignments_1.26.0                
## [121] Rsamtools_2.6.0                         
## [122] S4Vectors_0.28.1                        
## [123] GenomeInfoDbData_1.2.4                  
## [124] hms_1.0.0                               
## [125] grid_4.0.2                              
## [126] tidyr_1.1.2                             
## [127] rmarkdown_2.6                           
## [128] MatrixGenerics_1.2.1                    
## [129] Cairo_1.5-12.2                          
## [130] NLP_0.2-1